Starting form the filtered table from ‘HMP_coverage.Rmd’. Run a series of analysis to look at relationships between body site and subjects.
library(reshape2)
#library(igraph)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(biomod2)
library(e1071)
library(RColorBrewer)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-0
library(assertthat)
source('./staph_metagenome_tools.R')
dat4 <- read.table("./Data/combined")
#list of all subjects with more than one sample
multiSubjects <- count(dat4,Subject.Id) %>% filter(n > 1) %>% select(Subject.Id )
dat5 <- make_subtype_matrix(dat4)
#create Hamming dist matrices with and without cutof min value of 0.2
dat6 <- make_subtype_matrix(dat4) %>% bintr(0.2) %>% hamming.distance %>% data.frame
dat8 <- make_subtype_matrix(dat4) %>% hamming.distance %>% data.frame
test for significant associations of subtype with with bodysite and subject. us e Hamming dist. matrix. Two levels, one with a beta cutoff for all samples > 0.2 and one without
set.seed(344098)
run_bs_subj_adonis(dat6,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.1036 0.084895 3.3775 0.1187 0.001 ***
## Residuals 326 8.1943 0.025136 0.8813
## Total 339 9.2979 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 15.51 10.95
## buccal mucosa hard palate
## 14.54 0.00
## left retroauricular crease palatine tonsil
## 15.97 13.02
## posterior fornnix right antecubital fossa
## 10.02 0.00
## right retroauricular crease saliva
## 18.97 0.00
## stool subgingival_plaque
## 10.62 0.00
## supragingival plaque tongue dorsum
## 14.37 17.01
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 41603.032 37193.311 8469.370 3596.008 2390.837 1979.686 1506.512
## PCoA8
## 1166.622
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 2197.755 169.05808 5.134462 999 0.001
## Residuals 326 10733.925 32.92615 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 1 0.1238 0.123823 4.562 0.01332 0.008 **
## Residuals 338 9.1741 0.027142 0.98668
## Total 339 9.2979 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 9.220e+00 1.217e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.039e+01
## 8 9 10 11 12 13 14
## 2.598e+00 7.036e+00 2.439e+01 1.187e+01 1.574e+01 1.362e+01 0.000e+00
## 15 16 17 18 19 20 21
## 1.765e+01 0.000e+00 0.000e+00 1.102e+01 2.291e+01 0.000e+00 9.220e+00
## 22 23 24 25 26 27 28
## 1.196e+01 0.000e+00 0.000e+00 1.511e+01 7.036e+00 1.183e+01 1.320e+01
## 29 30 31 32 33 34 35
## 1.111e+01 1.005e+01 1.044e+01 1.598e+01 1.533e+01 1.187e+01 1.312e+01
## 36 37 38 39 40 41 42
## 0.000e+00 1.715e+01 4.899e+00 1.604e+01 1.675e+01 1.540e+01 1.114e+01
## 43 44 45 46 47 48 49
## 1.143e+01 6.896e+00 0.000e+00 0.000e+00 0.000e+00 9.381e+00 0.000e+00
## 50 51 52 53 54 55 56
## 0.000e+00 0.000e+00 8.130e-14 0.000e+00 0.000e+00 1.334e+01 1.922e+01
## 57 58 59 60 61 62 63
## 1.217e+01 0.000e+00 0.000e+00 1.535e+01 0.000e+00 0.000e+00 1.285e+01
## 64 65 66 67 68 69 70
## 0.000e+00 1.407e+01 1.142e+01 4.372e+00 1.409e+01 1.446e+01 1.068e+01
## 71 72 73 74 75 76 77
## 8.138e+00 1.722e+01 1.702e+01 9.440e+00 6.146e+00 7.868e+00 1.221e+01
## 78 79 80 81 82 83 84
## 8.599e+00 2.082e-14 1.150e+01 1.572e+01 1.897e-14 2.273e-14 2.398e+00
## 85 86 87 88 89 90 91
## 0.000e+00 1.176e+01 1.407e+01 0.000e+00 0.000e+00 1.764e+01 1.149e+01
## 92 93 94 95 96 97 98
## 1.170e+01 1.269e+01 1.926e+01 1.633e+01 0.000e+00 1.376e+01 0.000e+00
## 99 100 101 102 103 104 105
## 0.000e+00 1.187e+01 0.000e+00 1.072e+01 1.130e+01 0.000e+00 0.000e+00
## 106 107 108 109 110
## 1.340e+01 1.025e+01 1.407e+01 0.000e+00 9.220e+00
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 41603.032 37193.311 8469.370 3596.008 2390.837 1979.686 1506.512
## PCoA8
## 1166.622
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 109 11149.32 102.28732 1.188271 999 0.163
## Residuals 230 19798.59 86.08081 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.1036 0.084895 3.4337 0.11870 0.001 ***
## subj_vec 1 0.1589 0.158933 6.4283 0.01709 0.003 **
## Residuals 325 8.0353 0.024724 0.86421
## Total 339 9.2979 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
run_bs_subj_adonis(dat8,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.7003 0.207713 12.517 0.33295 0.001 ***
## Residuals 326 5.4098 0.016594 0.66705
## Total 339 8.1100 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 282
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 56.14 13.70
## buccal mucosa hard palate
## 16.22 0.00
## left retroauricular crease palatine tonsil
## 37.07 19.66
## posterior fornnix right antecubital fossa
## 11.57 0.00
## right retroauricular crease saliva
## 31.96 0.00
## stool subgingival_plaque
## 20.52 0.00
## supragingival plaque tongue dorsum
## 15.89 25.00
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 937128.241 22487.052 15107.447 7151.909 4346.681 3264.424
## PCoA7 PCoA8
## 2801.537 2365.001
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 85529.29 6579.176 4.764105 999 0.028
## Residuals 326 450202.43 1380.989 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 1 0.4003 0.40029 17.549 0.04936 0.001 ***
## Residuals 338 7.7097 0.02281 0.95064
## Total 339 8.1100 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 282
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 1.583e+01 2.864e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 6.129e+01
## 8 9 10 11 12 13 14
## 8.400e+01 2.371e+01 5.065e+01 2.262e+01 5.596e+01 6.132e+01 0.000e+00
## 15 16 17 18 19 20 21
## 2.793e+01 0.000e+00 0.000e+00 4.584e+01 2.476e+01 0.000e+00 1.154e+01
## 22 23 24 25 26 27 28
## 1.040e+01 0.000e+00 0.000e+00 4.128e+01 1.480e+01 3.242e+01 2.898e+01
## 29 30 31 32 33 34 35
## 1.987e+01 2.840e+01 1.840e+02 3.448e+01 1.796e+01 9.816e+01 4.058e+01
## 36 37 38 39 40 41 42
## 0.000e+00 5.659e+01 3.433e+01 4.281e+01 2.522e+01 3.329e+01 5.673e+01
## 43 44 45 46 47 48 49
## 1.349e+02 7.548e+01 0.000e+00 0.000e+00 0.000e+00 6.110e+00 0.000e+00
## 50 51 52 53 54 55 56
## 0.000e+00 0.000e+00 4.636e+01 0.000e+00 0.000e+00 2.038e+01 3.132e+01
## 57 58 59 60 61 62 63
## 3.035e+01 0.000e+00 0.000e+00 2.086e+01 0.000e+00 0.000e+00 3.096e+01
## 64 65 66 67 68 69 70
## 0.000e+00 1.790e+01 1.863e+01 4.031e+01 1.315e+01 1.491e+01 1.939e+01
## 71 72 73 74 75 76 77
## 1.895e+01 2.015e+01 2.220e+01 1.800e+01 1.559e+01 2.402e+01 1.653e+01
## 78 79 80 81 82 83 84
## 8.463e+00 1.740e+01 1.535e+01 2.122e+01 2.757e-12 4.792e-12 2.291e+01
## 85 86 87 88 89 90 91
## 0.000e+00 1.888e+01 9.165e+00 0.000e+00 0.000e+00 1.204e+01 1.028e+01
## 92 93 94 95 96 97 98
## 3.843e+01 1.105e+01 2.403e+01 1.682e+01 0.000e+00 6.451e+01 0.000e+00
## 99 100 101 102 103 104 105
## 0.000e+00 1.427e+01 0.000e+00 2.148e+01 1.262e+01 0.000e+00 0.000e+00
## 106 107 108 109 110
## 2.277e+01 1.124e+01 9.165e+00 0.000e+00 1.563e+01
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 937128.241 22487.052 15107.447 7151.909 4346.681 3264.424
## PCoA7 PCoA8
## 2801.537 2365.001
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 109 205874.1 1888.754 1.247548 999 0.216
## Residuals 230 348213.8 1513.973 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.7003 0.207713 12.911 0.33295 0.001 ***
## subj_vec 1 0.1813 0.181274 11.268 0.02235 0.001 ***
## Residuals 325 5.2285 0.016088 0.64469
## Total 339 8.1100 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test ffor whether intra-subject distance greater than intersubject
subject_perm(dat4,multiSubjects,dat6)
## Score for intraperson hits = 1499
## [1] "Quartlies for random distribution"
## 0% 25% 50% 75% 100%
## 1555 1649 1668 1687 1773
## Empirical p value = 0
#now look at the same test between body sites
bs <- levels(dat4$Body.site)
by_factor_perm(bs,dat4,dat6)
## [1] "anterior nares"
## [1] "Number of samples " "79"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 5848 6804 7028 7254 8150
## [1] 6980
## Empirical p value [1] 0.4431
##
## [1] "attached keratinized gingiva"
## [1] "Number of samples " "4"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 0 12 13 16 30
## [1] 8
## Empirical p value [1] 0.056
##
## [1] "buccal mucosa"
## [1] "Number of samples " "59"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 3076 3748 3894 4046 4870
## [1] 3192
## Empirical p value [1] 3e-04
##
## Zero samples in hard palate[1] "left retroauricular crease"
## [1] "Number of samples " "23"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 376 538 574 614 782
## [1] 636
## Empirical p value [1] 0.8603
##
## [1] "palatine tonsil"
## [1] "Number of samples " "6"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 5 29 34 39 63
## [1] 36
## Empirical p value [1] 0.6266
##
## [1] "posterior fornnix"
## [1] "Number of samples " "11"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 48 114 126 138 190
## [1] 76
## Empirical p value [1] 0.0051
##
## Zero samples in right antecubital fossa[1] "right retroauricular crease"
## [1] "Number of samples " "28"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 549 810 860 912 1127
## [1] 1081
## Empirical p value [1] 0.9978
##
## Zero samples in saliva[1] "stool"
## [1] "Number of samples " "7"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6 42 48 54 82
## [1] 30
## Empirical p value [1] 0.0334
##
## Zero samples in subgingival_plaque[1] "supragingival plaque"
## [1] "Number of samples " "37"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 1060 1442 1518 1594 1924
## [1] 1164
## Empirical p value [1] 8e-04
##
## [1] "tongue dorsum"
## [1] "Number of samples " "82"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6105 7338 7561 7797 8843
## [1] 7815
## Empirical p value [1] 0.7673
presence_mat <- as.data.frame(bintr(dat5,0.2))
top_score_mat <- as.data.frame(bintr(dat5,0.5))
# png("~/Dropbox/ARTICLES_BY_TDR/2015-staph-metagenome/HMP_barchart.png",width=640, height =640, res = 75)
# dev.off()
genotypes_plot(presence_mat,"All samples, subtypes present > 0.2")
genotypes_plot(top_score_mat,"All samples, subtypes present > 0.5")
for (i in bs) {
bss_rows <- which(dat4$Body.site == i)
if(length(bss_rows) > 0) {
bs_df <- slice(presence_mat,bss_rows)
genotypes_plot(bs_df,paste("Present: ", i))
}
}
for (i in bs) {
bss_rows <- which(dat4$Body.site == i)
if(length(bss_rows) > 0) {
bs_df <- slice(top_score_mat,bss_rows)
genotypes_plot(bs_df,paste("Top score: ", i))
}
}
### PCA
par(mfrow=c(2,2))
pcobj <- prcomp(dat6)
tr_gray <- rgb(0.5,.5,.5,.15)
for (i in bs) {
prcols <- rep(tr_gray,nrow(dat6))
prcols[which(dat4$Body.site == i)] <- "red"
plot(pcobj$x,col = prcols, pch = 16, main = i)
}
for (i in multiSubjects$Subject.Id) {
sub_rows = which(dat4$Subject.Id == as.character(i))
if (length(sub_rows) > 3){
prcols <- rep(tr_gray,nrow(dat6))
prcols[sub_rows] <- "blue"
plot(pcobj$x,col = prcols, pch = 16, main = c("Subject",i))
}
}
### Session Info
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] assertthat_0.1 vegan_2.3-0 lattice_0.20-33
## [4] permute_0.8-4 gdata_2.17.0 RColorBrewer_1.1-2
## [7] e1071_1.6-7 dplyr_0.4.2 reshape2_1.4.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.0 cluster_2.0.3 knitr_1.10.5 magrittr_1.5
## [5] MASS_7.3-43 R6_2.1.0 stringr_1.0.0 plyr_1.8.3
## [9] tools_3.2.1 parallel_3.2.1 grid_3.2.1 nlme_3.1-121
## [13] mgcv_1.8-7 DBI_0.3.1 htmltools_0.2.6 class_7.3-13
## [17] gtools_3.5.0 lazyeval_0.1.10 yaml_2.1.13 digest_0.6.8
## [21] Matrix_1.2-2 evaluate_0.7 rmarkdown_0.7 stringi_0.5-5